References
# Wall-clock start of the run; read again at the end of the notebook to
# report the total execution time.
import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# Special
!pip install featuretools[complete]
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
# update modules
!pip uninstall xgboost
!pip install -U xgboost
print('Environment: Google Colab')
from search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
sns.set()
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as skmetrics
# special
import imblearn
import featuretools as ft
# warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# Random seed passed to the models further down for reproducibility.
SEED = 100
# Use fully qualified option names: the short forms are pattern-matched by
# pandas and fail with OptionError once more than one option matches
# (e.g. display.max_columns vs styler.render.max_columns).
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('plotting.backend', 'matplotlib')  # matplotlib, bokeh, altair, plotly
%load_ext watermark
%watermark -iv
<frozen importlib._bootstrap>:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
2020-12-20 12:03:28,960 featuretools - WARNING Featuretools failed to load plugin nlp_primitives from library nlp_primitives. For a full stack trace, set logging to debug. pandas : 1.1.5 imblearn : 0.7.0 sklearn : 0.23.2 numpy : 1.19.4 seaborn : 0.11.0 joblib : 1.0.0 matplotlib : 3.3.3 sys : 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] json : 2.0.9 featuretools : 0.22.0 plotly_express: 0.4.1 autopep8 : 1.5.4
<frozen importlib._bootstrap>:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
def show_methods(obj, ncols=4,contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names are taken from ``dir(obj)``, skipping anything that starts with an
    underscore.  When ``contains`` is given, only names holding that
    substring are kept.  The names are split into ``ncols`` consecutive
    chunks, one chunk per column; shorter columns are padded with ''.

    Returns a pandas DataFrame of strings.
    """
    names = []
    for attr in dir(obj):
        if attr.startswith('_'):
            continue
        if contains is not None and contains not in attr:
            continue
        names.append(attr)

    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).transpose()
    return table.fillna('')
# Raw train/test splits: read from the local data folder by default and from
# the GitHub-hosted copies when running on Colab.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    # NOTE(review): the doubled ".csv.csv" suffix looks like a typo - confirm
    # the actual file name in the dataset repository before changing it.
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows.  DataFrame.append is deprecated (and
# removed in pandas 2.0); pd.concat builds the same frame.
pd.concat([df_train.head(2), df_train.tail(2)])
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
# Name of the label column used throughout the notebook.
target_name = 'Churn'
import plotly_express as px
# Distribution of the target.
px.histogram(df_train, x=target_name, width=300, height=300)
# Gender counts, coloured by the target.
px.histogram(df_train, x='gender', color=target_name, height=200, width=300)
# True when every row carries a distinct customerID.
len(df_train) == df_train['customerID'].nunique()
True
def clean_data(dfx):
    """Return a cleaned copy of the raw frame; the input is left untouched.

    * ``gender`` is dropped (the EDA in this notebook found it has no effect).
      ``customerID`` is kept so it can serve as the index feature later.
    * ``TotalCharges`` is converted to numeric; entries that cannot be parsed
      are imputed with 0.
    """
    cleaned = dfx.copy().drop(columns=['gender'])
    charges = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')
    cleaned['TotalCharges'] = charges.fillna(0)
    return cleaned
# Clean both splits with the same routine.
df_train = clean_data(df_train)
df_test = clean_data(df_test)
# Separate the feature frames from the target, encoding No -> 0 and Yes -> 1.
df_Xtrain = df_train.drop(columns=target_name)
ser_ytrain = df_train[target_name].map({'No':0, 'Yes':1})
df_Xtest = df_test.drop(columns=target_name)
ser_ytest = df_test[target_name].map({'No':0, 'Yes':1})
# List the variable types featuretools offers.
from featuretools import variable_types as vtypes
show_methods(vtypes)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Boolean | FilePath | PandasTypes | api |
| 1 | Categorical | FullName | PhoneNumber | camel_to_snake |
| 2 | ClassNameDescriptor | IPAddress | SubRegionCode | find_variable_types |
| 3 | CountryCode | Id | Text | graph_variable_types |
| 4 | DEFAULT_DTYPE_VALUES | Index | TimeIndex | list_variable_types |
| 5 | DateOfBirth | LatLong | Timedelta | np |
| 6 | Datetime | NaturalLanguage | URL | pd |
| 7 | DatetimeTimeIndex | Numeric | Unknown | utils |
| 8 | Discrete | NumericTimeIndex | Variable | variable |
| 9 | EmailAddress | Ordinal | ZIPCode | warnings |
# Preview the training features after cleaning (gender and the target removed).
df_Xtrain.head(2)
| customerID | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
| 1 | 7143-BQIBA | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
# df_train[cols_obj].apply(lambda x: pd.Series.unique(x))
# Number of distinct values in every object-typed column, smallest first.
cols_obj = list(df_train.select_dtypes('object').columns)
df_train[cols_obj].nunique().sort_values()
Partner 2 Dependents 2 PhoneService 2 PaperlessBilling 2 Churn 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaymentMethod 4 customerID 5634 dtype: int64
# customer id is index
# other columns have very low cardinality, we can take ordinal or ohe.
cols_cat = [col for col in cols_obj
            if col != 'customerID' and col != target_name]
print(cols_cat)
['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
# Columns pandas parsed as numeric.
cols_num = list(df_train.select_dtypes(include='number').columns)
cols_num
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
# SeniorCitizen is stored as a number but is handled as a categorical feature,
# so move it from the numeric group to the categorical one.
cols_num = [col for col in cols_num if col != 'SeniorCitizen']
cols_cat.append('SeniorCitizen')
features = cols_cat + cols_num
def get_fm(dfx,cols_num=cols_num,cols_cat=cols_cat,
           index='customerID'
           ):
    """Build a one-hot encoded featuretools feature matrix from ``dfx``.

    ``dfx`` is loaded into an EntitySet as the entity "data", keyed by
    ``index``; ``cols_cat`` are declared Categorical and ``cols_num`` Numeric.
    SeniorCitizen is normalized into its own entity, Deep Feature Synthesis
    is run with the ``divide_numeric`` transform primitive, and every
    object-typed column of the result is expanded with ``pd.get_dummies``.

    Returns the encoded feature matrix (a DataFrame).
    """
    variable_types = {}
    for col in cols_cat:
        variable_types[col] = vtypes.Categorical
    for col in cols_num:
        variable_types[col] = vtypes.Numeric

    es = ft.EntitySet("data")
    es.entity_from_dataframe(
        entity_id="data",
        dataframe=dfx,
        index=index,
        time_index=None,
        variable_types=variable_types,
    )

    # Give SeniorCitizen its own entity so dfs can add per-group aggregates.
    es.normalize_entity(
        base_entity_id="data",
        new_entity_id="SeniorCitizen",
        index="SeniorCitizen",
    )

    # Adding This gave me worse result.
    # new_entity_id="Dependents"
    # es.normalize_entity(base_entity_id="data",
    #                     new_entity_id=new_entity_id,
    #                     index=new_entity_id
    #                     )

    # divide_numeric adds the ratio of each pair of numeric features.
    primitives = ['divide_numeric']
    feature_matrix, _feature_defs = ft.dfs(
        entityset=es,
        target_entity="data",
        trans_primitives=primitives,
        drop_exact=[],
        verbose=True,
    )

    # One-hot encode whatever is still string-typed.
    object_cols = list(feature_matrix.select_dtypes('object').columns)
    return pd.get_dummies(feature_matrix, columns=object_cols, drop_first=False)
# Engineered feature matrix for the training split (indexed by customerID).
df_Xtrain_new = get_fm(df_Xtrain)
df_Xtrain_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | 
DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| customerID | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1621-YNCJH | 0 | 36 | 106.05 | 3834.40 | 0.027658 | 2.945833 | 36.156530 | 106.511111 | 0.339463 | 0.009389 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
| 7143-BQIBA | 0 | 10 | 62.25 | 612.95 | 0.101558 | 6.225000 | 9.846586 | 61.295000 | 0.160643 | 0.016315 | 4743 | 118.75 | 8672.45 | 72 | 61.71498 | 2166.606304 | 32.078431 | 18.25 | 0.0 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.088151 | 1.037776 | 0.255781 | 30.247656 | 2221.937336 | 24.622527 | 292714.15 | 10276213.7 | 152148 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
# Engineered feature matrix for the test split.
# NOTE(review): get_fm is run on the test rows alone, so the SeniorCitizen
# aggregates and the one-hot columns are derived from test data only and the
# resulting column set can differ from the training matrix - confirm this is intended.
df_Xtest_new = get_fm(df_Xtest)
df_Xtest_new.head(2)
Built 71 features Elapsed: 00:00 | Progress: 100%|██████████
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | MonthlyCharges / TotalCharges | MonthlyCharges / tenure | TotalCharges / MonthlyCharges | TotalCharges / tenure | tenure / MonthlyCharges | tenure / TotalCharges | SeniorCitizen.COUNT(data) | SeniorCitizen.MAX(data.MonthlyCharges) | SeniorCitizen.MAX(data.TotalCharges) | SeniorCitizen.MAX(data.tenure) | SeniorCitizen.MEAN(data.MonthlyCharges) | SeniorCitizen.MEAN(data.TotalCharges) | SeniorCitizen.MEAN(data.tenure) | SeniorCitizen.MIN(data.MonthlyCharges) | SeniorCitizen.MIN(data.TotalCharges) | SeniorCitizen.MIN(data.tenure) | SeniorCitizen.NUM_UNIQUE(data.Contract) | SeniorCitizen.NUM_UNIQUE(data.Dependents) | SeniorCitizen.NUM_UNIQUE(data.DeviceProtection) | SeniorCitizen.NUM_UNIQUE(data.InternetService) | SeniorCitizen.NUM_UNIQUE(data.MultipleLines) | SeniorCitizen.NUM_UNIQUE(data.OnlineBackup) | SeniorCitizen.NUM_UNIQUE(data.OnlineSecurity) | SeniorCitizen.NUM_UNIQUE(data.PaperlessBilling) | SeniorCitizen.NUM_UNIQUE(data.Partner) | SeniorCitizen.NUM_UNIQUE(data.PaymentMethod) | SeniorCitizen.NUM_UNIQUE(data.PhoneService) | SeniorCitizen.NUM_UNIQUE(data.StreamingMovies) | SeniorCitizen.NUM_UNIQUE(data.StreamingTV) | SeniorCitizen.NUM_UNIQUE(data.TechSupport) | SeniorCitizen.SKEW(data.MonthlyCharges) | SeniorCitizen.SKEW(data.TotalCharges) | SeniorCitizen.SKEW(data.tenure) | SeniorCitizen.STD(data.MonthlyCharges) | SeniorCitizen.STD(data.TotalCharges) | SeniorCitizen.STD(data.tenure) | SeniorCitizen.SUM(data.MonthlyCharges) | SeniorCitizen.SUM(data.TotalCharges) | SeniorCitizen.SUM(data.tenure) | Partner_No | Partner_Yes | Dependents_No | Dependents_Yes | PhoneService_No | PhoneService_Yes | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | 
DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaperlessBilling_No | PaperlessBilling_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | SeniorCitizen.MODE(data.Contract)_Month-to-month | SeniorCitizen.MODE(data.Dependents)_No | SeniorCitizen.MODE(data.DeviceProtection)_No | SeniorCitizen.MODE(data.InternetService)_Fiber optic | SeniorCitizen.MODE(data.MultipleLines)_No | SeniorCitizen.MODE(data.MultipleLines)_Yes | SeniorCitizen.MODE(data.OnlineBackup)_No | SeniorCitizen.MODE(data.OnlineSecurity)_No | SeniorCitizen.MODE(data.PaperlessBilling)_Yes | SeniorCitizen.MODE(data.Partner)_No | SeniorCitizen.MODE(data.Partner)_Yes | SeniorCitizen.MODE(data.PaymentMethod)_Electronic check | SeniorCitizen.MODE(data.PhoneService)_Yes | SeniorCitizen.MODE(data.StreamingMovies)_No | SeniorCitizen.MODE(data.StreamingMovies)_Yes | SeniorCitizen.MODE(data.StreamingTV)_No | SeniorCitizen.MODE(data.StreamingTV)_Yes | SeniorCitizen.MODE(data.TechSupport)_No | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| customerID | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1794-HBQTJ | 0 | 1 | 48.6 | 48.6 | 1.000000 | 48.600000 | 1.00000 | 48.600000 | 0.020576 | 0.020576 | 1158 | 118.60 | 8684.80 | 72 | 62.389983 | 2219.692358 | 32.658031 | 18.7 | 0.00 | 0 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.070659 | 1.072957 | 0.228395 | 30.601714 | 2278.817867 | 24.658823 | 72247.60 | 2570403.75 | 37818 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
| 0356-OBMAC | 1 | 56 | 99.9 | 5706.3 | 0.017507 | 1.783929 | 57.12012 | 101.898214 | 0.560561 | 0.009814 | 251 | 117.35 | 8436.25 | 72 | 79.820120 | 2727.039641 | 32.314741 | 19.2 | 19.45 | 1 | 3 | 2 | 3 | 3 | 3 | 3 | 3 | 2 | 2 | 4 | 2 | 3 | 3 | 3 | -0.974773 | 0.663470 | 0.237100 | 22.771121 | 2395.833841 | 24.655882 | 20034.85 | 684486.95 | 8111 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 |
def post_process_fm(fm,thr_miss=0.95,thr_corr=0.95,
                    idname='index',targetname='price'):
    """Post process feature matrix.

    Steps, in order:

    1. remove duplicated features (columns sharing a name; the first is kept)
    2. replace +/-inf with NaN
    3. drop id / label columns, i.e. columns whose name contains ``idname``
       or ``targetname``
    4. one hot encode any remaining non-numeric columns
    5. remove features whose fraction of missing values exceeds ``thr_miss``
    6. remove zero variance features (exactly one distinct non-null value)
    7. remove highly collinear features: for every pair whose absolute
       correlation exceeds ``thr_corr`` the later column is dropped

    Parameters
    ----------
    fm : pandas.DataFrame
        Feature matrix; it is not modified in place.
    thr_miss : float
        Maximum tolerated fraction of missing values per column.
    thr_corr : float
        Maximum tolerated absolute pairwise correlation.
    idname, targetname : str or None
        Substrings identifying id / label columns to drop. ``None`` (or an
        empty string) disables the corresponding drop. The defaults keep the
        values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The reduced feature matrix. A summary of each step is printed.
    """
    # Remove duplicated features
    n_start = fm.shape[1]
    fm = fm.iloc[:, ~fm.columns.duplicated()]
    n_dups = n_start - fm.shape[1]
    print(f'There were {n_dups} duplicated features.')

    # Ratio features can produce infinities; treat them as missing.
    fm = fm.replace({np.inf: np.nan, -np.inf: np.nan})

    # Remove the ids and labels
    cols_drop_id_target = []
    for pattern in (idname, targetname):
        if not pattern:
            continue
        cols_drop_id_target += [col for col in fm.columns
                                if pattern in str(col)
                                and col not in cols_drop_id_target]
    print('Dropping ids and label: ', cols_drop_id_target)
    fm = fm.drop(columns=cols_drop_id_target)

    # One hot encoding (if necessary)
    fm = pd.get_dummies(fm)
    print('Original shape: ', fm.shape)

    # Columns whose missing fraction is above the threshold
    frac_missing = fm.isnull().mean()
    cols_miss = list(frac_missing[frac_missing > thr_miss].index)
    n_cols_miss = len(cols_miss)
    fm = fm.drop(columns=cols_miss)
    print('{} missing columns with threshold: {}.'.format(
        n_cols_miss, thr_miss))

    # Zero variance: a single distinct (non-null) value
    n_unique = fm.nunique()
    cols_zero_var = list(n_unique[n_unique == 1].index)
    n_cols_zero_var = len(cols_zero_var)
    fm = fm.drop(columns=cols_zero_var)
    print('{} zero variance columns.'.format(n_cols_zero_var))

    # Correlations: keep only the strict upper triangle so each pair is
    # inspected once and the earlier column of a pair survives.
    # (plain ``bool`` instead of the removed ``np.bool`` alias)
    df_corr = fm.corr()
    upper_mask = np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
    df_upper = df_corr.where(upper_mask)

    # Select the features with correlations above the threshold
    # Need to use the absolute value
    cols_drop = [col for col in df_upper.columns
                 if (df_upper[col].abs() > thr_corr).any()]
    n_collinear = len(cols_drop)
    fm = fm.drop(columns=cols_drop)
    print('{} collinear columns removed with correlation above {}.'.format(
        n_collinear, thr_corr))

    n_total_cols_removed = n_dups + n_cols_miss + n_cols_zero_var + n_collinear
    print('Total columns removed: ', n_total_cols_removed)
    print('Shape after feature selection: {}.'.format(fm.shape))
    return fm
# Prune both feature matrices with the same thresholds.
# NOTE(review): the selection is run independently on each split, so the
# surviving columns depend on each split's own statistics and are not
# guaranteed to be the same set - verify before modelling.
df_Xtrain_good = post_process_fm(df_Xtrain_new,thr_miss=0.9,thr_corr=0.9)
df_Xtest_good = post_process_fm(df_Xtest_new,thr_miss=0.9,thr_corr=0.9)
There were 0 duplicated features. Dropping ids and label: [] Original shape: (5634, 99) 0 missing columns with threshold: 0.9. 26 zero variance columns. 39 collinear columns removed with correlation above 0.9. Total columns removed: 65 Shape after feature selection: (5634, 34). There were 0 duplicated features. Dropping ids and label: [] Original shape: (1409, 100) 0 missing columns with threshold: 0.9. 25 zero variance columns. 41 collinear columns removed with correlation above 0.9. Total columns removed: 66 Shape after feature selection: (1409, 34).
# if some features are not common in train test, exclude them.
# Compare train against test (the symmetric difference lists columns present
# in only one of the two); comparing a frame with itself is always empty.
cols_exclude = np.setxor1d(df_Xtrain_good.columns, df_Xtest_good.columns)
cols_exclude
array([], dtype=object)
# Model inputs: missing values are filled with 0.
Xtr = df_Xtrain_good.fillna(0)
ytr = np.array(ser_ytrain)
# The two splits were feature-selected separately, so force the test matrix
# onto the training columns (same set, same order); a column missing from the
# test split becomes all-NaN and is then filled with 0.
Xtx = df_Xtest_good.reindex(columns=Xtr.columns).fillna(0)
ytx = np.array(ser_ytest)
# Gives Worse result
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(Xtr)
# Xtr = scaler.transform(Xtr)
# Xtx = scaler.transform(Xtx)
# Baseline: shallow gradient-boosted trees, scored with a confusion matrix
# on the test split.
from xgboost import XGBClassifier
model = XGBClassifier(max_depth=3, subsample=0.9, random_state=SEED)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
/Users/poudel/opt/miniconda3/envs/ft/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
[12:07:42] WARNING: /Users/runner/miniforge3/conda-bld/xgboost_1607604592557/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
array([[921, 114],
[169, 205]])
# XGBClassifier?
from sklearn.linear_model import LogisticRegression
# Settings that stay constant across experiments.
params_fixed = {'dual': False,
                'random_state': SEED,
                'n_jobs': 1
                }
# Tuned hyperparameters.
params_best = {'C': 0.42679058013626753, 'max_iter': 1000,
               'penalty': 'l2', 'solver': 'lbfgs'}
# Merge into a new dict so params_fixed is not mutated through an alias.
params = {**params_fixed, **params_best}
model = LogisticRegression(**params)
model.fit(Xtr, ytr)
ypreds = model.predict(Xtx)
skmetrics.confusion_matrix(ytx, ypreds)
array([[925, 110],
[168, 206]])
# True test labels as an array, plus the per-class probabilities predicted by
# the current `model` for the test matrix.
ytest = np.array(ser_ytest)
yprobs2d = model.predict_proba(Xtx)
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Evaluate a binary classifier, show the scores and save them to CSV.

    Parameters
    ----------
    model_name : str
        Label used as the row index of the result table and in the output
        file name ``model_<model_name>.csv``.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted hard labels.
    yprobs2d : array-like, two columns
        Predicted class probabilities; column 1 is taken as the score of the
        positive class.
    show_plots : bool
        When true, draw the precision-recall curve, ROC curve and confusion
        matrix with scikit-plot.

    Prints the classification report and confusion matrix, displays a
    one-row table (Accuracy, Precision, Recall, F1-score, AUC) and writes it
    to the current directory on Colab or to ``../outputs`` otherwise.
    """
    import sklearn.metrics as skmetrics
    import os

    # AUC is a ranking metric: score it with the positive-class probability,
    # not with the thresholded labels.
    yprobs_pos = np.asarray(yprobs2d)[:, 1]

    acc = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall = skmetrics.recall_score(ytest,ypreds)
    f1 = skmetrics.f1_score(ytest,ypreds)
    auc = skmetrics.roc_auc_score(ytest,yprobs_pos)

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]},index=[model_name])
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly so the Colab file is not the hidden ".model_<name>.csv".
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        import scikitplot.metrics as skpmetrics
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)
# Score the logistic-regression predictions and store them under the name 'LR'.
model_eval_bin('LR',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support
0 0.85 0.89 0.87 1035
1 0.65 0.55 0.60 374
accuracy 0.80 1409
macro avg 0.75 0.72 0.73 1409
weighted avg 0.79 0.80 0.80 1409
[[925 110]
[168 206]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| LR | 0.8027 | 0.6519 | 0.5508 | 0.5971 | 0.7223 |
# Report the total wall-clock time of the notebook run.
time_taken = time.time() - time_start_notebook
# h = whole hours, m = seconds left over after removing them.
h, m = divmod(time_taken, 3600)
msg = 'Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'
print(msg.format(h, *divmod(m, 60)))
Time taken to run whole notebook: 0 hr 0 min 5 secs